In [ ]:
import pandas as pd
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
  1. Load the Dataset
In [ ]:
# Load the dataset
df = pd.read_csv('/content/insurance.csv')

# Display the first few rows of the dataset
df.head()
Out[ ]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
  1. Data Preprocessing
In [ ]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib  # For loading and saving models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Step 1: Load and preprocess the dataset
df = pd.read_csv('/content/insurance.csv')

# One-hot encode categorical variables (sex, smoker, region)
df_encoded = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

# Prepare data for modeling
X = df_encoded.drop('charges', axis=1)
y = df_encoded['charges']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 2: Train and Save Models
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# Train and save each model
for name, model in models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f'final_{name.lower().replace(" ", "_")}_model.pkl')

# Step 3: Load and Test the Saved Models
# Load the saved models
try:
    loaded_rf_model = joblib.load('final_random_forest_model.pkl')
    loaded_gb_model = joblib.load('final_gradient_boosting_model.pkl')
    loaded_xgb_model = joblib.load('final_xgboost_model.pkl')

    # Make predictions
    test_rf_pred = loaded_rf_model.predict(X_test)
    test_gb_pred = loaded_gb_model.predict(X_test)
    test_xgb_pred = loaded_xgb_model.predict(X_test)

    # Evaluate the performance of the loaded models
    def evaluate_model(predictions, true_values, model_name):
        mae = mean_absolute_error(true_values, predictions)
        rmse = np.sqrt(mean_squared_error(true_values, predictions))
        r2 = r2_score(true_values, predictions)
        print(f"\n{model_name} Performance:")
        print(f"Mean Absolute Error: {mae:.2f}")
        print(f"Root Mean Squared Error: {rmse:.2f}")
        print(f"R-Squared: {r2:.2f}")

    evaluate_model(test_rf_pred, y_test, "Random Forest")
    evaluate_model(test_gb_pred, y_test, "Gradient Boosting")
    evaluate_model(test_xgb_pred, y_test, "XGBoost")
except FileNotFoundError as e:
    print(f"Error: {e}")
Random Forest Performance:
Mean Absolute Error: 2667.15
Root Mean Squared Error: 4657.03
R-Squared: 0.85

Gradient Boosting Performance:
Mean Absolute Error: 2489.00
Root Mean Squared Error: 4435.72
R-Squared: 0.87

XGBoost Performance:
Mean Absolute Error: 2815.97
Root Mean Squared Error: 4908.25
R-Squared: 0.84
In [ ]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the Dataset
dataset_path = '/content/insurance.csv'  # Update this path as needed
df = pd.read_csv(dataset_path)

# Step 2: Inspect the Dataset
print(f"Total number of records: {len(df)}")
print(f"Column headers: {df.columns.tolist()}")
print(df.head())

# Step 3: Data Preprocessing
# 3.1 Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# 3.2 Encode categorical variables
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

# 3.3 Feature Scaling (Normalization)
scaler = StandardScaler()
features = ['age', 'bmi', 'children']
df[features] = scaler.fit_transform(df[features])

# Step 4: Define Features and Target Variable
X = df.drop('charges', axis=1)
y = df['charges']

# Step 5: Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Model Selection and Training
# 6.1 RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# 6.2 GradientBoostingRegressor
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

# Step 7: Model Evaluation on Test Data
# 7.1 RandomForestRegressor Predictions
rf_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

# 7.2 GradientBoostingRegressor Predictions
gb_pred = gb_model.predict(X_test)
gb_mae = mean_absolute_error(y_test, gb_pred)
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_pred))
gb_r2 = r2_score(y_test, gb_pred)

print("\nRandom Forest Metrics:")
print(f"MAE: {rf_mae}")
print(f"RMSE: {rf_rmse}")
print(f"R²: {rf_r2}")

print("\nGradient Boosting Metrics:")
print(f"MAE: {gb_mae}")
print(f"RMSE: {gb_rmse}")
print(f"R²: {gb_r2}")

# Step 8: Cross-Validation for Stability Check
rf_cv_scores = cross_val_score(rf_model, X, y, cv=10, scoring='r2')
gb_cv_scores = cross_val_score(gb_model, X, y, cv=10, scoring='r2')

print("\nRandom Forest Cross-Validation R² Scores:")
print(rf_cv_scores)
print(f"Mean R²: {rf_cv_scores.mean()}")

print("\nGradient Boosting Cross-Validation R² Scores:")
print(gb_cv_scores)
print(f"Mean R²: {gb_cv_scores.mean()}")

# Step 9: Visualizations

# 9.1 Dataset Analysis
# Age distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['age'], kde=True, bins=20)
plt.title('Age Distribution')
plt.savefig('age_distribution.png')
plt.show()

# BMI distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['bmi'], kde=True, bins=20)
plt.title('BMI Distribution')
plt.savefig('bmi_distribution.png')
plt.show()

# Charges distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['charges'], kde=True, bins=20)
plt.title('Charges Distribution')
plt.savefig('charges_distribution.png')
plt.show()

# Correlation matrix
plt.figure(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.show()

# 9.2 Model Performance Visualizations
# Actual vs Predicted Plot for Random Forest
plt.figure(figsize=(10, 6))
plt.scatter(y_test, rf_pred, alpha=0.6, label='Random Forest')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.title('Actual vs Predicted Charges (Random Forest)')
plt.legend()
plt.savefig('rf_actual_vs_predicted.png')
plt.show()

# Actual vs Predicted Plot for Gradient Boosting
plt.figure(figsize=(10, 6))
plt.scatter(y_test, gb_pred, alpha=0.6, label='Gradient Boosting')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.title('Actual vs Predicted Charges (Gradient Boosting)')
plt.legend()
plt.savefig('gb_actual_vs_predicted.png')
plt.show()

# Residual Plot for Random Forest
rf_residuals = y_test - rf_pred
plt.figure(figsize=(10, 6))
sns.histplot(rf_residuals, kde=True)
plt.title('Distribution of Residuals (Random Forest)')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.savefig('rf_residual_distribution.png')
plt.show()

# Residual Plot for Gradient Boosting
gb_residuals = y_test - gb_pred
plt.figure(figsize=(10, 6))
sns.histplot(gb_residuals, kde=True)
plt.title('Distribution of Residuals (Gradient Boosting)')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.savefig('gb_residual_distribution.png')
plt.show()

# 9.3 Learning Curves for both models
train_sizes, rf_train_scores, rf_test_scores = learning_curve(
    rf_model, X, y, cv=10, scoring='neg_mean_squared_error', train_sizes=np.linspace(0.1, 1.0, 10)
)
train_sizes, gb_train_scores, gb_test_scores = learning_curve(
    gb_model, X, y, cv=10, scoring='neg_mean_squared_error', train_sizes=np.linspace(0.1, 1.0, 10)
)

rf_train_scores_mean = -np.mean(rf_train_scores, axis=1)
rf_test_scores_mean = -np.mean(rf_test_scores, axis=1)
gb_train_scores_mean = -np.mean(gb_train_scores, axis=1)
gb_test_scores_mean = -np.mean(gb_test_scores, axis=1)

# Learning curve for Random Forest
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, rf_train_scores_mean, 'o-', color='r', label='Training error (RF)')
plt.plot(train_sizes, rf_test_scores_mean, 'o-', color='g', label='Validation error (RF)')
plt.xlabel('Training Size')
plt.ylabel('Error')
plt.title('Learning Curve (Random Forest)')
plt.legend(loc='best')
plt.savefig('rf_learning_curve.png')
plt.show()

# Learning curve for Gradient Boosting
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, gb_train_scores_mean, 'o-', color='r', label='Training error (GB)')
plt.plot(train_sizes, gb_test_scores_mean, 'o-', color='g', label='Validation error (GB)')
plt.xlabel('Training Size')
plt.ylabel('Error')
plt.title('Learning Curve (Gradient Boosting)')
plt.legend(loc='best')
plt.savefig('gb_learning_curve.png')
plt.show()

# Step 10: Save the Models
import joblib
joblib.dump(rf_model, 'final_rf_model.pkl')
joblib.dump(gb_model, 'final_gb_model.pkl')

# Step 11: Load and Test the Saved Models (Optional)
# Load the saved models
loaded_rf_model = joblib.load('final_random_forest_model.pkl')
loaded_gb_model = joblib.load('final_gradient_boosting_model.pkl')
loaded_xgb_model = joblib.load('final_xgboost_model.pkl')

# Make predictions
test_rf_pred = loaded_rf_model.predict(X_test)
test_gb_pred = loaded_gb_model.predict(X_test)
test_xgb_pred = loaded_xgb_model.predict(X_test)

# Evaluate the performance of the loaded models
def evaluate_model(predictions, true_values, model_name):
    mae = mean_absolute_error(true_values, predictions)
    rmse = np.sqrt(mean_squared_error(true_values, predictions))
    r2 = r2_score(true_values, predictions)
    print(f"\n{model_name} Performance:")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}")
    print(f"R-Squared: {r2:.2f}")

evaluate_model(test_rf_pred, y_test, "Random Forest")
evaluate_model(test_gb_pred, y_test, "Gradient Boosting")
evaluate_model(test_xgb_pred, y_test, "XGBoost")
Total number of records: 1338
Column headers: ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
Missing values in each column:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

Random Forest Metrics:
MAE: 2664.9715886942795
RMSE: 4634.451593997286
R²: 0.8535158195419218

Gradient Boosting Metrics:
MAE: 2490.6412888151885
RMSE: 4438.103389963076
R²: 0.8656651002627456

Random Forest Cross-Validation R² Scores:
[0.86423437 0.84087091 0.81698301 0.72623614 0.85542821 0.88821693
 0.85690537 0.79614713 0.84282273 0.85853391]
Mean R²: 0.8346378711809906

Gradient Boosting Cross-Validation R² Scores:
[0.88636658 0.8687801  0.8403614  0.74483395 0.86702461 0.92481761
 0.8810778  0.82014416 0.86095864 0.86414137]
Mean R²: 0.8558506219727745
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
Random Forest Performance:
Mean Absolute Error: 7604.09
Root Mean Squared Error: 11604.48
R-Squared: 0.08

Gradient Boosting Performance:
Mean Absolute Error: 8556.39
Root Mean Squared Error: 12249.33
R-Squared: -0.02

XGBoost Performance:
Mean Absolute Error: 8953.14
Root Mean Squared Error: 12489.64
R-Squared: -0.06
In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming df is your DataFrame
df = pd.read_csv('/content/insurance.csv')

# 1. Distribution of Charges
plt.figure(figsize=(10, 6))
sns.histplot(df['charges'], kde=True, bins=30)
plt.title('Distribution of Healthcare Costs (Charges)')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.savefig('distribution_of_charges.png')
plt.show()

# 2. Charges by Age Group
plt.figure(figsize=(10, 6))
sns.scatterplot(x='age', y='charges', data=df)
plt.title('Healthcare Costs by Age')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.savefig('charges_by_age.png')
plt.show()

# 3. Charges by BMI
plt.figure(figsize=(10, 6))
sns.scatterplot(x='bmi', y='charges', data=df)
plt.title('Healthcare Costs by BMI')
plt.xlabel('BMI')
plt.ylabel('Charges')
plt.savefig('charges_by_bmi.png')
plt.show()

# 4. Charges by Smoking Status
# Check if 'smoker' column exists, else use 'smoker_yes' if it was one-hot encoded
if 'smoker' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='smoker', y='charges', data=df)
    plt.title('Healthcare Costs by Smoking Status')
    plt.xlabel('Smoker')
    plt.ylabel('Charges')
    plt.savefig('charges_by_smoking_status.png')
    plt.show()
else:
    # Assuming 'smoker_yes' and 'smoker_no' exist after get_dummies
    df['smoker'] = df['smoker_yes'] if 'smoker_yes' in df.columns else df['smoker_no']
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='smoker', y='charges', data=df)
    plt.title('Healthcare Costs by Smoking Status')
    plt.xlabel('Smoker')
    plt.ylabel('Charges')
    plt.savefig('charges_by_smoking_status.png')
    plt.show()

# 5. Charges by Region
# If region was one-hot encoded, this will plot each region separately
if 'region' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='region', y='charges', data=df)
    plt.title('Healthcare Costs by Region')
    plt.xlabel('Region')
    plt.ylabel('Charges')
    plt.savefig('charges_by_region.png')
    plt.show()
else:
    # Assuming regions were one-hot encoded, plot each as a separate series
    plt.figure(figsize=(10, 6))
    regions = [col for col in df.columns if 'region_' in col]
    for region in regions:
        sns.boxplot(y='charges', data=df[df[region] == 1], label=region)
    plt.title('Healthcare Costs by Region')
    plt.xlabel('Region')
    plt.ylabel('Charges')
    plt.legend()
    plt.savefig('charges_by_region.png')
    plt.show()

# 6. Charges by Number of Children
plt.figure(figsize=(10, 6))
sns.boxplot(x='children', y='charges', data=df)
plt.title('Healthcare Costs by Number of Children')
plt.xlabel('Number of Children')
plt.ylabel('Charges')
plt.savefig('charges_by_children.png')
plt.show()

# 7. Interaction Effect: Charges by Age and Smoking Status
if 'smoker' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.lmplot(x='age', y='charges', hue='smoker', data=df, aspect=2, height=6, ci=None, palette='muted')
    plt.title('Interaction Effect: Age and Smoking Status on Healthcare Costs')
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.savefig('interaction_age_smoker.png')
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
<Figure size 1000x600 with 0 Axes>
No description has been provided for this image
In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('/content/insurance.csv')

# One-hot encode categorical variables (sex, smoker, region)
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)

# Check the columns to confirm the correct ones for region
print("Columns after encoding:", df.columns)

# Set up the matplotlib figure
plt.figure(figsize=(16, 12))

# 1. Charges by Age
plt.subplot(2, 3, 1)
sns.scatterplot(x='age', y='charges', data=df)
plt.title('Healthcare Costs by Age')
plt.xlabel('Age')
plt.ylabel('Charges')

# 2. Charges by BMI
plt.subplot(2, 3, 2)
sns.scatterplot(x='bmi', y='charges', data=df)
plt.title('Healthcare Costs by BMI')
plt.xlabel('BMI')
plt.ylabel('Charges')

# 3. Charges by Smoking Status
plt.subplot(2, 3, 3)
sns.boxplot(x='smoker_yes', y='charges', data=df)
plt.title('Healthcare Costs by Smoking Status')
plt.xlabel('Smoker (1=Yes, 0=No)')
plt.ylabel('Charges')

# 4. Charges by Region
plt.subplot(2, 3, 4)
# Check which region columns exist
region_columns = [col for col in df.columns if col.startswith('region_')]
if region_columns:
    # Create a 'region' column to represent regions more clearly after one-hot encoding
    df['region'] = df[region_columns].idxmax(axis=1).apply(lambda x: x.split('_')[1].capitalize())
    sns.boxplot(x='region', y='charges', data=df)
    plt.title('Healthcare Costs by Region')
    plt.xlabel('Region')
    plt.ylabel('Charges')
else:
    print("No region columns found. Please check the encoding step.")

# 5. Charges by Number of Children
plt.subplot(2, 3, 5)
sns.boxplot(x='children', y='charges', data=df)
plt.title('Healthcare Costs by Number of Children')
plt.xlabel('Number of Children')
plt.ylabel('Charges')

# 6. Interaction: Age and Smoking Status
plt.subplot(2, 3, 6)
sns.scatterplot(x='age', y='charges', hue='smoker_yes', data=df, palette='muted')
plt.title('Interaction: Age and Smoking Status on Costs')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.legend(title='Smoker (1=Yes, 0=No)', loc='upper left')

# Adjust layout
plt.tight_layout()
plt.savefig('comparative_analysis.png')
plt.show()
Columns after encoding: Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')
No description has been provided for this image

convert .html file code

In [ ]:

Figure 1

figure 2

In [ ]:
import matplotlib.pyplot as plt
import numpy as np

# Performance metrics for Random Forest
rf_metrics = {
    'Mean Absolute Error (MAE)': 2667.15,
    'Root Mean Squared Error (RMSE)': 4657.03,
    'R-Squared (R²)': 0.85
}

# Plotting the performance metrics
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(rf_metrics.keys(), rf_metrics.values(), color=['skyblue', 'salmon', 'lightgreen'])

# Adding text annotations
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2.0, yval + 50, round(yval, 2), ha='center', va='bottom')

plt.title('Random Forest Regressor Performance')
plt.ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_2_random_forest_performance.png')
plt.show()
No description has been provided for this image

Figure 3: Gradient Boosting Regressor Performance

In [ ]:
# Performance metrics for Gradient Boosting
gb_metrics = {
    'Mean Absolute Error (MAE)': 2489.00,
    'Root Mean Squared Error (RMSE)': 4435.72,
    'R-Squared (R²)': 0.87
}

# Plotting the performance metrics
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(gb_metrics.keys(), gb_metrics.values(), color=['skyblue', 'salmon', 'lightgreen'])

# Adding text annotations
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2.0, yval + 50, round(yval, 2), ha='center', va='bottom')

plt.title('Gradient Boosting Regressor Performance')
plt.ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_3_gradient_boosting_performance.png')
plt.show()
No description has been provided for this image

Figure 4: XGBoost Regressor Performance

In [ ]:
# Performance metrics for XGBoost
xgb_metrics = {
    'Mean Absolute Error (MAE)': 2815.97,
    'Root Mean Squared Error (RMSE)': 4908.25,
    'R-Squared (R²)': 0.84
}

# Plotting the performance metrics
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(xgb_metrics.keys(), xgb_metrics.values(), color=['skyblue', 'salmon', 'lightgreen'])

# Adding text annotations
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2.0, yval + 50, round(yval, 2), ha='center', va='bottom')

plt.title('XGBoost Regressor Performance')
plt.ylabel('Score')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_4_xgboost_performance.png')
plt.show()
No description has been provided for this image

Figure 5: Random Forest Regressor Cross-Validation R² Scores

In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Sample R² scores for Random Forest from cross-validation
rf_cv_r2_scores = [0.864, 0.841, 0.817, 0.726, 0.855, 0.888, 0.857, 0.796, 0.843, 0.859]

# Plotting the Random Forest Regressor cross-validation R² scores
plt.figure(figsize=(10, 6))
sns.boxplot(y=rf_cv_r2_scores, color='skyblue')
plt.title('Random Forest Regressor Cross-Validation R² Scores')
plt.ylabel('R² Score')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_5_rf_cv_r2_scores.png')
plt.show()
No description has been provided for this image

Figure 6: Gradient Boosting Regressor Cross-Validation R² Scores

In [ ]:
# Sample R² scores for Gradient Boosting from cross-validation
gb_cv_r2_scores = [0.886, 0.869, 0.840, 0.745, 0.867, 0.925, 0.881, 0.820, 0.861, 0.864]

# Plotting the Gradient Boosting Regressor cross-validation R² scores
plt.figure(figsize=(10, 6))
sns.boxplot(y=gb_cv_r2_scores, color='lightgreen')
plt.title('Gradient Boosting Regressor Cross-Validation R² Scores')
plt.ylabel('R² Score')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('figure_6_gb_cv_r2_scores.png')
plt.show()
No description has been provided for this image

Figure 4.2.1: Age Distribution

In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('/content/insurance.csv')

# 4.2.1 Plotting the distribution of age
plt.figure(figsize=(10, 6))
sns.histplot(df['age'], bins=20, kde=True, color='blue')
plt.title('Figure 4.2.1: Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image

Figure 4.2.2: BMI Distribution

In [ ]:
# Plotting the distribution of BMI
plt.figure(figsize=(10, 6))
sns.histplot(data['bmi'], bins=20, kde=True, color='orange')
plt.title('Figure 4.2.2: Distribution of BMI')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_2_BMI_Distribution.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.3: Healthcare Costs Distribution

In [ ]:
# Plotting the distribution of healthcare costs
plt.figure(figsize=(10, 6))
sns.histplot(data['charges'], bins=20, kde=True, color='green')
plt.title('Figure 4.2.3: Distribution of Healthcare Costs')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_3_Healthcare_Costs.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.4: Correlation Matrix

In [ ]:
# Selecting only numeric columns
numeric_data = data.select_dtypes(include=['float64', 'int64'])

# Plotting the correlation matrix
plt.figure(figsize=(12, 10))
correlation_matrix = numeric_data.corr()  # Compute correlation matrix only on numeric columns
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Figure 4.2.4: Correlation Matrix')
plt.xlabel('Features')
plt.ylabel('Features')
plt.grid(True)
plt.savefig('/content/Figure_4_2_4_Correlation_Matrix.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.5: Random Forest - Actual vs Predicted Charges

In [ ]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Preparing the data
X = data.drop('charges', axis=1)
y = data['charges']

# Encoding categorical features
X = pd.get_dummies(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Random Forest model
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)

# Predicting
y_pred_rf = model_rf.predict(X_test)

# Plotting actual vs predicted charges
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred_rf, color='purple')
plt.plot([0, max(y_test)], [0, max(y_test)], color='red', linestyle='--')
plt.title('Figure 4.2.5: Actual vs Predicted Charges (Random Forest)')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_5_Actual_vs_Predicted_RF.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.6: Gradient Boosting - Actual vs Predicted Charges

In [ ]:
from sklearn.ensemble import GradientBoostingRegressor

# Train the Gradient Boosting model
model_gb = GradientBoostingRegressor()
model_gb.fit(X_train, y_train)

# Predicting
y_pred_gb = model_gb.predict(X_test)

# Plotting actual vs predicted charges
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred_gb, color='cyan')
plt.plot([0, max(y_test)], [0, max(y_test)], color='red', linestyle='--')
plt.title('Figure 4.2.6: Actual vs Predicted Charges (Gradient Boosting)')
plt.xlabel('Actual Charges')
plt.ylabel('Predicted Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_6_Actual_vs_Predicted_GB.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.7: Random Forest - Residual Analysis

In [ ]:
# Calculating residuals
residuals_rf = y_test - y_pred_rf

# Plotting residuals distribution
plt.figure(figsize=(10, 6))
sns.histplot(residuals_rf, bins=20, kde=True, color='magenta')
plt.title('Figure 4.2.7: Residual Distribution (Random Forest)')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_7_Residuals_RF.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.8: Gradient Boosting - Residual Analysis

In [ ]:
# Calculating residuals
residuals_gb = y_test - y_pred_gb

# Plotting residuals distribution
plt.figure(figsize=(10, 6))
sns.histplot(residuals_gb, bins=20, kde=True, color='brown')
plt.title('Figure 4.2.8: Residual Distribution (Gradient Boosting)')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_8_Residuals_GB.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.9: Random Forest - Learning Curve

In [ ]:
from sklearn.model_selection import learning_curve

# Plotting learning curve
plt.figure(figsize=(10, 6))
train_sizes, train_scores, test_scores = learning_curve(model_rf, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
plt.plot(train_sizes, -train_scores.mean(axis=1), label='Train Error', color='blue')
plt.plot(train_sizes, -test_scores.mean(axis=1), label='Test Error', color='green')
plt.title('Figure 4.2.9: Learning Curve (Random Forest)')
plt.xlabel('Training Size')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.grid(True)
plt.savefig('/content/Figure_4_2_9_Learning_Curve_RF.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.10: Gradient Boosting - Learning Curve

In [ ]:
# Plotting learning curve
plt.figure(figsize=(10, 6))
train_sizes, train_scores, test_scores = learning_curve(model_gb, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
plt.plot(train_sizes, -train_scores.mean(axis=1), label='Train Error', color='blue')
plt.plot(train_sizes, -test_scores.mean(axis=1), label='Test Error', color='green')
plt.title('Figure 4.2.10: Learning Curve (Gradient Boosting)')
plt.xlabel('Training Size')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.grid(True)
plt.savefig('/content/Figure_4_2_10_Learning_Curve_GB.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.11: Overall Distribution of Healthcare Costs

In [ ]:
# Plotting the overall distribution of healthcare costs
plt.figure(figsize=(10, 6))
sns.histplot(data['charges'], bins=30, kde=True, color='teal')
plt.title('Figure 4.2.11: Overall Distribution of Healthcare Costs')
plt.xlabel('Charges')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/content/Figure_4_2_11_Overall_Distribution_Costs.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.12: Relationship Between Age and Healthcare Costs

In [ ]:
# Plotting the relationship between age and healthcare costs
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data['age'], y=data['charges'], color='red')
plt.title('Figure 4.2.12: Relationship Between Age and Healthcare Costs')
plt.xlabel('Age')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_12_Age_vs_Costs.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.13: Relationship Between BMI and Healthcare Costs

In [ ]:
# Plotting the relationship between BMI and healthcare costs
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data['bmi'], y=data['charges'], color='purple')
plt.title('Figure 4.2.13: Relationship Between BMI and Healthcare Costs')
plt.xlabel('BMI')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_13_BMI_vs_Costs.png')  # Save the figure
plt.show()
No description has been provided for this image

Figure 4.2.14: Impact of Smoking on Healthcare Costs

In [ ]:
# Plotting the impact of smoking status on healthcare costs
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['smoker'], y=data['charges'], palette='Set2')
plt.title('Figure 4.2.14: Impact of Smoking Status on Healthcare Costs')
plt.xlabel('Smoker')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_14_Smoking_Status_vs_Costs.png')  # Save the figure
plt.show()
<ipython-input-33-a14e15d04678>:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=data['smoker'], y=data['charges'], palette='Set2')
No description has been provided for this image

Figure 4.2.15: Regional Differences in Healthcare Costs

In [ ]:
# Plotting regional differences in healthcare costs
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['region'], y=data['charges'], palette='Set1')
plt.title('Figure 4.2.15: Regional Differences in Healthcare Costs')
plt.xlabel('Region')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_15_Regional_Differences.png')  # Save the figure
plt.show()
<ipython-input-34-ee4208271dbd>:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=data['region'], y=data['charges'], palette='Set1')
No description has been provided for this image

Figure 4.2.16: Number of Children vs. Healthcare Costs

In [ ]:
# Plotting the relationship between number of children and healthcare costs
plt.figure(figsize=(10, 6))
sns.boxplot(x=data['children'], y=data['charges'], palette='Pastel1')
plt.title('Figure 4.2.16: Number of Children and Healthcare Costs')
plt.xlabel('Number of Children')
plt.ylabel('Charges')
plt.grid(True)
plt.savefig('/content/Figure_4_2_16_Children_vs_Costs.png')  # Save the figure
plt.show()
<ipython-input-35-882bfd53e39f>:3: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=data['children'], y=data['charges'], palette='Pastel1')
No description has been provided for this image

Figure 4.2.17: Interaction Between Age and Smoking on Healthcare Costs

In [ ]:
# Creating a combined column for age and smoking status interaction
data['age_smoker'] = data['age'].astype(str) + '_' + data['smoker']

# Plotting the interaction between age and smoking status
plt.figure(figsize=(12, 8))
sns.boxplot(x='age_smoker', y='charges', data=data, palette='viridis')
plt.title('Figure 4.2.17: Interaction Between Age and Smoking Status')
plt.xlabel('Age and Smoking Status')
plt.ylabel('Charges')
plt.xticks(rotation=90)  # Rotate x-axis labels for readability
plt.grid(True)
plt.savefig('/content/Figure_4_2_17_Age_Smoking_Interaction.png')  # Save the figure
plt.show()
<ipython-input-36-c6b1813f4d4f>:6: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='age_smoker', y='charges', data=data, palette='viridis')
No description has been provided for this image

all graphs in single file

In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('/content/insurance.csv')

# Create a figure with a grid of subplots
fig, axs = plt.subplots(4, 5, figsize=(20, 16))  # 4 rows, 5 columns

# Flatten the axes array for easy iteration
axs = axs.flatten()

# Plotting the distribution of age (Figure 4.2.1)
sns.histplot(data['age'], bins=20, kde=True, color='blue', ax=axs[0])
axs[0].set_title('Figure 4.2.1: Distribution of Age')
axs[0].set_xlabel('Age')
axs[0].set_ylabel('Frequency')

# Plotting the distribution of BMI (Figure 4.2.2)
sns.histplot(data['bmi'], bins=20, kde=True, color='orange', ax=axs[1])
axs[1].set_title('Figure 4.2.2: Distribution of BMI')
axs[1].set_xlabel('BMI')
axs[1].set_ylabel('Frequency')

# Plotting the distribution of healthcare costs (Figure 4.2.3)
sns.histplot(data['charges'], bins=20, kde=True, color='green', ax=axs[2])
axs[2].set_title('Figure 4.2.3: Distribution of Healthcare Costs')
axs[2].set_xlabel('Charges')
axs[2].set_ylabel('Frequency')

# Plotting the correlation matrix (Figure 4.2.4)
numeric_data = data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=axs[3])
axs[3].set_title('Figure 4.2.4: Correlation Matrix')

# Plotting actual vs predicted charges (Random Forest) (Figure 4.2.5)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X = data.drop('charges', axis=1)
y = data['charges']
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

sns.scatterplot(x=y_test, y=y_pred_rf, color='purple', ax=axs[4])
axs[4].plot([0, max(y_test)], [0, max(y_test)], color='red', linestyle='--')
axs[4].set_title('Figure 4.2.5: Actual vs Predicted Charges (Random Forest)')
axs[4].set_xlabel('Actual Charges')
axs[4].set_ylabel('Predicted Charges')

# Plotting actual vs predicted charges (Gradient Boosting) (Figure 4.2.6)
from sklearn.ensemble import GradientBoostingRegressor

model_gb = GradientBoostingRegressor()
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)

sns.scatterplot(x=y_test, y=y_pred_gb, color='cyan', ax=axs[5])
axs[5].plot([0, max(y_test)], [0, max(y_test)], color='red', linestyle='--')
axs[5].set_title('Figure 4.2.6: Actual vs Predicted Charges (Gradient Boosting)')
axs[5].set_xlabel('Actual Charges')
axs[5].set_ylabel('Predicted Charges')

# Plotting residuals distribution (Random Forest) (Figure 4.2.7)
residuals_rf = y_test - y_pred_rf
sns.histplot(residuals_rf, bins=20, kde=True, color='magenta', ax=axs[6])
axs[6].set_title('Figure 4.2.7: Residual Distribution (Random Forest)')
axs[6].set_xlabel('Residuals')
axs[6].set_ylabel('Frequency')

# Plotting residuals distribution (Gradient Boosting) (Figure 4.2.8)
residuals_gb = y_test - y_pred_gb
sns.histplot(residuals_gb, bins=20, kde=True, color='brown', ax=axs[7])
axs[7].set_title('Figure 4.2.8: Residual Distribution (Gradient Boosting)')
axs[7].set_xlabel('Residuals')
axs[7].set_ylabel('Frequency')

# Plotting learning curve (Random Forest) (Figure 4.2.9)
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(model_rf, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
axs[8].plot(train_sizes, -train_scores.mean(axis=1), label='Train Error', color='blue')
axs[8].plot(train_sizes, -test_scores.mean(axis=1), label='Test Error', color='green')
axs[8].set_title('Figure 4.2.9: Learning Curve (Random Forest)')
axs[8].set_xlabel('Training Size')
axs[8].set_ylabel('Mean Squared Error')
axs[8].legend()
axs[8].grid(True)

# Plotting learning curve (Gradient Boosting) (Figure 4.2.10)
train_sizes, train_scores, test_scores = learning_curve(model_gb, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
axs[9].plot(train_sizes, -train_scores.mean(axis=1), label='Train Error', color='blue')
axs[9].plot(train_sizes, -test_scores.mean(axis=1), label='Test Error', color='green')
axs[9].set_title('Figure 4.2.10: Learning Curve (Gradient Boosting)')
axs[9].set_xlabel('Training Size')
axs[9].set_ylabel('Mean Squared Error')
axs[9].legend()
axs[9].grid(True)

# Plotting overall distribution of healthcare costs (Figure 4.2.11)
sns.histplot(data['charges'], bins=30, kde=True, color='teal', ax=axs[10])
axs[10].set_title('Figure 4.2.11: Overall Distribution of Healthcare Costs')
axs[10].set_xlabel('Charges')
axs[10].set_ylabel('Frequency')

# Plotting relationship between age and healthcare costs (Figure 4.2.12)
sns.scatterplot(x=data['age'], y=data['charges'], color='red', ax=axs[11])
axs[11].set_title('Figure 4.2.12: Relationship Between Age and Healthcare Costs')
axs[11].set_xlabel('Age')
axs[11].set_ylabel('Charges')

# Plotting relationship between BMI and healthcare costs (Figure 4.2.13)
sns.scatterplot(x=data['bmi'], y=data['charges'], color='purple', ax=axs[12])
axs[12].set_title('Figure 4.2.13: Relationship Between BMI and Healthcare Costs')
axs[12].set_xlabel('BMI')
axs[12].set_ylabel('Charges')

# Plotting impact of smoking status on healthcare costs (Figure 4.2.14)
sns.boxplot(x=data['smoker'], y=data['charges'], palette='Set2', ax=axs[13])
axs[13].set_title('Figure 4.2.14: Impact of Smoking Status on Healthcare Costs')
axs[13].set_xlabel('Smoker')
axs[13].set_ylabel('Charges')

# Plotting regional differences in healthcare costs (Figure 4.2.15)
sns.boxplot(x=data['region'], y=data['charges'], palette='Set1', ax=axs[14])
axs[14].set_title('Figure 4.2.15: Regional Differences in Healthcare Costs')
axs[14].set_xlabel('Region')
axs[14].set_ylabel('Charges')

# Plotting number of children and healthcare costs (Figure 4.2.16)
sns.boxplot(x=data['children'], y=data['charges'], palette='Pastel1', ax=axs[15])
axs[15].set_title('Figure 4.2.16: Number of Children and Healthcare Costs')
axs[15].set_xlabel('Number of Children')
axs[15].set_ylabel('Charges')

# Plotting interaction between age and smoking status (Figure 4.2.17)
data['age_smoker'] = data['age'].astype(str) + '_' + data['smoker']
sns.boxplot(x='age_smoker', y='charges', data=data, palette='viridis', ax=axs[16])
axs[16].set_title('Figure 4.2.17: Interaction Between Age and Smoking Status')
axs[16].set_xlabel('Age and Smoking Status')
axs[16].set_ylabel('Charges')
axs[16].tick_params(axis='x', rotation=90)  # Rotate x-axis labels for readability

# Adjust layout to avoid overlap
plt.tight_layout()

# Save the entire figure
plt.savefig('/content/All_Figures_4_2.png')  # Save the figure
plt.show()
<ipython-input-37-df516952e2aa>:124: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=data['smoker'], y=data['charges'], palette='Set2', ax=axs[13])
<ipython-input-37-df516952e2aa>:130: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=data['region'], y=data['charges'], palette='Set1', ax=axs[14])
<ipython-input-37-df516952e2aa>:136: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=data['children'], y=data['charges'], palette='Pastel1', ax=axs[15])
<ipython-input-37-df516952e2aa>:143: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='age_smoker', y='charges', data=data, palette='viridis', ax=axs[16])
No description has been provided for this image
In [ ]:
# Step 1: Upload the notebook file
from google.colab import files

# Upload notebook file
uploaded = files.upload()

# The uploaded file name
notebook_file = list(uploaded.keys())[0]
print(f"Uploaded file: {notebook_file}")

# Step 2: Install nbconvert if not already installed
!pip install nbconvert

# Step 3: Convert the notebook to HTML
import subprocess

# Define the output HTML file name
output_html_file = notebook_file.replace('.ipynb', '.html')

# Convert the notebook to HTML
try:
    subprocess.run(['jupyter', 'nbconvert', '--to', 'html', notebook_file, '--output', output_html_file], check=True)
    print(f"Conversion successful: {output_html_file} created.")
except subprocess.CalledProcessError as e:
    print(f"Error during conversion: {e}")

# Step 4: Provide download link for the HTML file
from google.colab import files

# Download the HTML file
try:
    files.download(output_html_file)
    print(f"Download initiated for: {output_html_file}")
except Exception as e:
    print(f"Error during download: {e}")
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving Maheshinsurance (1).ipynb to Maheshinsurance (1).ipynb
Uploaded file: Maheshinsurance (1).ipynb
Requirement already satisfied: nbconvert in /usr/local/lib/python3.10/dist-packages (6.5.4)
Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.9.4)
Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (4.12.3)
Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from nbconvert) (6.1.0)
Requirement already satisfied: defusedxml in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.7.1)
Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.4)
Requirement already satisfied: jinja2>=3.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (3.1.4)
Requirement already satisfied: jupyter-core>=4.7 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.2)
Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.3.0)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.1.5)
Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.8.4)
Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (0.10.0)
Requirement already satisfied: nbformat>=5.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.10.4)
Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from nbconvert) (24.1)
Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.5.1)
Requirement already satisfied: pygments>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (2.16.1)
Requirement already satisfied: tinycss2 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (1.3.0)
Requirement already satisfied: traitlets>=5.0 in /usr/local/lib/python3.10/dist-packages (from nbconvert) (5.7.1)
Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.10/dist-packages (from jupyter-core>=4.7->nbconvert) (4.2.2)
Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.10/dist-packages (from nbclient>=0.5.0->nbconvert) (6.1.12)
Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (2.20.0)
Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.10/dist-packages (from nbformat>=5.1->nbconvert) (4.23.0)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->nbconvert) (2.5)
Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (1.16.0)
Requirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->nbconvert) (0.5.1)
Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (24.2.0)
Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (2023.12.1)
Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.35.1)
Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=2.6->nbformat>=5.1->nbconvert) (0.20.0)
Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (24.0.1)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (2.8.2)
Requirement already satisfied: tornado>=4.1 in /usr/local/lib/python3.10/dist-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.3.3)
Conversion successful: Maheshinsurance (1).html created.
Download initiated for: Maheshinsurance (1).html

DIagrams for the project

In [ ]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta

# Define the start date of the project
start_date = datetime(2024, 6, 30)

# Define the tasks and their durations (in days)
tasks = [
    ("Introduction", 7),         # 1 week
    ("Literature Review", 14),   # 2 weeks
    ("Methodology", 7),          # 1 week
    ("Implementation", 14),      # 2 weeks
    ("FInding & Analysis", 7),       # 1 week
    ("Conclusion", 7)            # 1 week
]

# Calculate task start and end dates
task_dates = []
current_start = start_date
for task, duration in tasks:
    end_date = current_start + timedelta(days=duration)
    task_dates.append((task, current_start, end_date))
    current_start = end_date + timedelta(days=1)  # Adding 1 day for the next task to start

# Create the Gantt chart
fig, ax = plt.subplots(figsize=(10, 6))

# Plot each task
for i, (task, start, end) in enumerate(task_dates):
    ax.barh(i, (end - start).days, left=start, color='skyblue', edgecolor='black')
    ax.text(start + (end - start) / 2, i, task, ha='center', va='center', color='black', fontsize=10)

# Set the date format on the x-axis
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=7))
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))

# Set labels and title
ax.set_xlabel('Date')
ax.set_ylabel('Tasks')
ax.set_title('8-Week Project Plan (June 30th - September 5th, 2024)')
ax.set_yticks(range(len(tasks)))
ax.set_yticklabels([task for task, start, end in task_dates])

# Invert the y-axis to have the first task at the top
ax.invert_yaxis()

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)

# Show the grid for better readability
plt.grid(True, axis='x', linestyle='--', alpha=0.6)

# Show the Gantt chart
plt.tight_layout()
plt.show()
No description has been provided for this image

2.4

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Define the data for the table
data = {
    "Algorithm": ["Random Forest", "Gradient Boosting Machines (GBM)", "XGBoost"],
    "Strengths": [
        "Handles high-dimensional data effectively; reduces overfitting through ensemble learning; robust to noisy data.",
        "Provides high accuracy by correcting errors of previous models; captures complex patterns and interactions.",
        "Optimized for speed and performance; includes regularization to prevent overfitting; handles missing values effectively."
    ],
    "Limitations": [
        "Computationally intensive for large datasets; less transparent.",
        "High risk of overfitting if not tuned correctly; requires significant computation.",
        "Requires careful hyperparameter tuning; implementation complexity."
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Plotting the table
fig, ax = plt.subplots(figsize=(10, 3))  # Set the figure size

ax.axis('tight')
ax.axis('off')
tbl = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center')

# Adjusting font size
tbl.auto_set_font_size(False)
tbl.set_fontsize(10)

# Adjust column widths
tbl.auto_set_column_width(col=list(range(len(df.columns))))

# Save the table as an image
plt.savefig('ML_Algorithm_Comparison_Table.png', bbox_inches='tight', dpi=300)
plt.show()
No description has been provided for this image

2.3

In [ ]:
import matplotlib.pyplot as plt
from graphviz import Digraph

# Create a new directed graph
dot = Digraph()

# Define the nodes
dot.node('A', 'Lack of Comprehensive Data Integration')
dot.node('B', 'Model Interpretability Issues')
dot.node('C', 'Ethical Concerns\n(Data Privacy and Security)')
dot.node('D', 'Need for High-Quality Datasets')

# Define the connections (edges) between nodes
dot.edge('A', 'B')
dot.edge('B', 'C')
dot.edge('C', 'D')

# Display the graph
dot.render('ML_Healthcare_Gaps', format='png', cleanup=False)
dot
Out[ ]:
No description has been provided for this image

2.2

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from graphviz import Digraph
from PIL import Image
import matplotlib.image as mpimg

# Step 1: Create and Save Table Image

# Define the data for the table
data = {
    "Technique": ["Linear Regression", "Time Series Analysis", "Random Forest", "Gradient Boosting Machines (GBM)", "XGBoost"],
    "Strengths": [
        "Simple, well-defined relationships",
        "Identifies patterns over time",
        "Handles high-dimensional data; reduces overfitting",
        "Captures complex patterns and interactions",
        "Optimized for speed; handles large datasets"
    ],
    "Limitations": [
        "Oversimplifies non-linear relationships",
        "Struggles with unexpected shifts and variability",
        "Computationally intensive; less transparent",
        "High risk of overfitting if not tuned correctly",
        "Requires careful tuning; implementation complexity"
    ]
}

df = pd.DataFrame(data)

# Plotting the table
fig, ax = plt.subplots(figsize=(12, 3))  # Adjust the size as needed
ax.axis('tight')
ax.axis('off')
tbl = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center')

# Adjusting font size
tbl.auto_set_font_size(False)
tbl.set_fontsize(10)

# Adjust column widths
tbl.auto_set_column_width(col=list(range(len(df.columns))))

# Save the table as an image
plt.savefig('table_image.png', bbox_inches='tight', dpi=300)
plt.close()

# Step 2: Create and Save Bar Chart

fig, ax = plt.subplots(figsize=(12, 6))

# Combine strengths and limitations for each technique
strengths = np.array([2, 3, 4, 4, 4])
limitations = np.array([3, 3, 3, 4, 4])

index = np.arange(len(df["Technique"]))
bar_width = 0.35

bar1 = plt.bar(index, strengths, bar_width, label="Strengths", color='skyblue')
bar2 = plt.bar(index + bar_width, limitations, bar_width, label="Limitations", color='salmon')

plt.xlabel('Techniques')
plt.ylabel('Score (1-5)')
plt.title('Comparison of Strengths and Limitations')
plt.xticks(index + bar_width / 2, df["Technique"], rotation=30, ha="right")
plt.legend()

# Adding the text on top of the bars
for bar in bar1:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.1, yval, ha='center', va='bottom')

for bar in bar2:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.1, yval, ha='center', va='bottom')

# Save the bar chart as an image
plt.tight_layout()
plt.savefig('bar_chart_image.png', dpi=300)
plt.close()

# Step 3: Create and Save Flowchart

dot = Digraph()

# Define the nodes
dot.node('A', 'Traditional Techniques')
dot.node('B', 'Linear Regression')
dot.node('C', 'Time Series Analysis')
dot.node('D', 'Machine Learning Techniques')
dot.node('E', 'Random Forest')
dot.node('F', 'Gradient Boosting Machines (GBM)')
dot.node('G', 'XGBoost')

# Define the connections (edges) between nodes
dot.edge('A', 'B')
dot.edge('A', 'C')
dot.edge('A', 'D')
dot.edge('D', 'E')
dot.edge('D', 'F')
dot.edge('D', 'G')

# Render the flowchart to an image
dot.render('flowchart_image', format='png', cleanup=False)

# Step 4: Combine All Images into a Single Image

# Load images
table_img = Image.open('table_image.png')
bar_chart_img = Image.open('bar_chart_image.png')
flowchart_img = Image.open('flowchart_image.png')

# Create a new image with enough space to hold all three images
combined_width = max(table_img.width, bar_chart_img.width, flowchart_img.width)
combined_height = table_img.height + bar_chart_img.height + flowchart_img.height

combined_img = Image.new('RGB', (combined_width, combined_height), (255, 255, 255))

# Paste images into the combined image
combined_img.paste(table_img, (0, 0))
combined_img.paste(bar_chart_img, (0, table_img.height))
combined_img.paste(flowchart_img, (0, table_img.height + bar_chart_img.height))

# Save the combined image
combined_img.save('combined_image.png')

# Display the combined image to check the output
combined_img.show()

3 Methodology

In [ ]:
from graphviz import Digraph
import os

# Define the save path
save_path = '/content/drive/MyDrive/Projects'
if not os.path.exists(save_path):
    os.makedirs(save_path)

# Define the filename and full path
filename = 'methodology_flowchart.png'
full_path = os.path.join(save_path, filename)

# Define the diagram
dot = Digraph()

# Define the nodes
dot.node('A', 'Data Collection')
dot.node('B', 'Data Preprocessing')
dot.node('C', 'Algorithm Selection')
dot.node('D', 'Evaluation Metrics')
dot.node('E', 'Model Performance Assessment')

# Define the connections (edges) between nodes
dot.edge('A', 'B', label='Source Datasets from Kaggle & Academic Databases')
dot.edge('B', 'C', label='Ensure Data Quality and Relevance')
dot.edge('C', 'D', label='Use Advanced ML Algorithms')
dot.edge('D', 'E', label='Assess Model Performance')

# Render the flowchart to an image
dot.render(filename=full_path.replace('.png', ''), format='png', cleanup=False)

print(f"Flowchart saved as '{full_path}'")
Flowchart saved as '/content/drive/MyDrive/dissertations/Mahesh/methodology_flowchart.png'

3.1

In [ ]:
from graphviz import Digraph
import os

# Define the save path
save_path = '/content/drive/MyDrive/Projects'
if not os.path.exists(save_path):
    os.makedirs(save_path)

# Define the filename and full path
filename = 'methodology_detailed_flowchart.png'
full_path = os.path.join(save_path, filename)

# Define the diagram
dot = Digraph()

# Define the nodes
dot.node('A', 'Traditional Methods')
dot.node('A1', 'Linear Regression')
dot.node('A2', 'Time-Series Analysis')
dot.node('B', 'Advanced Approach')
dot.node('B1', 'Medical Cost Personal Dataset')
dot.node('B2', 'Insights from IEEE Xplore & Google Scholar')
dot.node('C', 'Implementation')
dot.node('C1', 'Google Colab Environment')
dot.node('C2', 'Python Libraries\n(Pandas, NumPy, Scikit-learn)')
dot.node('C3', 'Data Visualization\n(Matplotlib, Seaborn)')
dot.node('C4', 'Google Drive Integration')

# Define the connections (edges) between nodes
dot.edge('A', 'B', label='Overcoming Limitations')
dot.edge('A1', 'A', label='Used for Basic Analysis')
dot.edge('A2', 'A', label='Used for Trend Identification')
dot.edge('B', 'C', label='Advanced Methodology')
dot.edge('B1', 'B', label='Comprehensive Dataset')
dot.edge('B2', 'B', label='Scholarly Insights')
dot.edge('C', 'C1', label='Computational Resources')
dot.edge('C', 'C2', label='Data Manipulation and ML')
dot.edge('C', 'C3', label='Model Performance Visualization')
dot.edge('C', 'C4', label='Data Storage and Access')

# Render the flowchart to an image
dot.render(filename=full_path.replace('.png', ''), format='png', cleanup=False)

print(f"Flowchart saved as '{full_path}'")
Flowchart saved as '/content/drive/MyDrive/dissertations/Mahesh/methodology_detailed_flowchart.png'

3.2

In [ ]:
from graphviz import Digraph
import os

# Define the save path
save_path = '/content/drive/MyDrive/Projects'
if not os.path.exists(save_path):
    os.makedirs(save_path)

# Define the filename and full path
filename = 'data_preprocessing_flowchart.png'
full_path = os.path.join(save_path, filename)

# Define the diagram
dot = Digraph()

# Define the nodes for different preprocessing steps
dot.node('A', 'Data Preprocessing')
dot.node('A1', 'Data Cleaning')
dot.node('A2', 'Feature Engineering')
dot.node('A3', 'Normalization and Scaling')
dot.node('A4', 'Outlier Detection')
dot.node('B', 'Implementation in Google Colab')

# Define the nodes for details within each step
dot.node('A1.1', 'Remove Duplicate Entries')
dot.node('A1.2', 'Handle Missing Values:\n - Mean Imputation\n - Mode Imputation')

dot.node('A2.1', 'Generate Interaction Terms')
dot.node('A2.2', 'One-Hot Encoding')

dot.node('A3.1', 'Min-Max Scaling')

dot.node('A4.1', 'Detect Outliers using Z-Scores')
dot.node('A4.2', 'Transform or Remove Outliers')

# Define the connections (edges) between nodes
dot.edge('A', 'A1', label='Step 1: Data Cleaning')
dot.edge('A1', 'A1.1', label='Step 1.1')
dot.edge('A1', 'A1.2', label='Step 1.2')

dot.edge('A', 'A2', label='Step 2: Feature Engineering')
dot.edge('A2', 'A2.1', label='Step 2.1')
dot.edge('A2', 'A2.2', label='Step 2.2')

dot.edge('A', 'A3', label='Step 3: Normalization and Scaling')
dot.edge('A3', 'A3.1', label='Step 3.1')

dot.edge('A', 'A4', label='Step 4: Outlier Detection')
dot.edge('A4', 'A4.1', label='Step 4.1')
dot.edge('A4', 'A4.2', label='Step 4.2')

dot.edge('A', 'B', label='Implementation in Google Colab')
dot.edge('B', 'B', label='Facilitates All Steps')

# Render the flowchart to an image
dot.render(filename=full_path.replace('.png', ''), format='png', cleanup=False)

print(f"Flowchart saved as '{full_path}'")
Flowchart saved as '/content/drive/MyDrive/dissertations/Mahesh/data_preprocessing_flowchart.png'

3.3

In [ ]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def draw_random_forest_diagram():
    fig, ax = plt.subplots(figsize=(10, 6))

    ax.set_title('Figure 3.3.1: Random Forest Method', fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')

    # Define components
    components = ['Decision Tree 1', 'Decision Tree 2', 'Decision Tree 3', 'Decision Tree 4', 'Aggregated Prediction']
    positions = [(0.1, 0.6), (0.3, 0.6), (0.5, 0.6), (0.7, 0.6), (0.4, 0.3)]

    # Draw components
    for component, (x, y) in zip(components, positions):
        rect = patches.FancyBboxPatch((x, y), 0.2, 0.1, boxstyle="round,pad=0.05", edgecolor='black', facecolor='lightblue', lw=1.5)
        ax.add_patch(rect)
        ax.text(x + 0.1, y + 0.05, component, ha='center', va='center', fontsize=10, fontweight='bold')

    # Draw connections
    connections = [((0.1, 0.6), (0.4, 0.3)), ((0.3, 0.6), (0.4, 0.3)), ((0.5, 0.6), (0.4, 0.3)), ((0.7, 0.6), (0.4, 0.3))]
    for start, end in connections:
        ax.annotate('', xy=end, xytext=start, arrowprops=dict(arrowstyle="->", lw=1.5))

    # Save and show the figure
    plt.savefig('/content/drive/MyDrive/Projects/random_forest_diagram.png', bbox_inches='tight')
    plt.show()

draw_random_forest_diagram()
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def draw_gbm_diagram():
    fig, ax = plt.subplots(figsize=(10, 6))

    ax.set_title('Figure 3.3.2: Gradient Boosting Machines Method', fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')

    # Define components
    components = ['Model 1', 'Model 2', 'Model 3', 'Model 4']
    positions = [(0.2, 0.6), (0.2, 0.4), (0.2, 0.2), (0.2, 0.0)]

    # Draw components
    for component, (x, y) in zip(components, positions):
        rect = patches.FancyBboxPatch((x, y), 0.2, 0.1, boxstyle="round,pad=0.05", edgecolor='black', facecolor='lightgreen', lw=1.5)
        ax.add_patch(rect)
        ax.text(x + 0.1, y + 0.05, component, ha='center', va='center', fontsize=10, fontweight='bold')

    # Draw connections
    connections = [((0.2, 0.6), (0.2, 0.4)), ((0.2, 0.4), (0.2, 0.2)), ((0.2, 0.2), (0.2, 0.0))]
    for start, end in connections:
        ax.annotate('', xy=end, xytext=start, arrowprops=dict(arrowstyle="->", lw=1.5))

    # Save and show the figure
    plt.savefig('/content/drive/MyDrive/Projects/gbm_diagram.png', bbox_inches='tight')
    plt.show()

draw_gbm_diagram()
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def draw_xgboost_diagram():
    fig, ax = plt.subplots(figsize=(10, 6))

    ax.set_title('Figure 3.3.3: XGBoost Method', fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')

    # Define components
    components = ['XGBoost', 'Regularization', 'Parallel Processing']
    positions = [(0.3, 0.4), (0.1, 0.6), (0.5, 0.6)]

    # Draw components
    for component, (x, y) in zip(components, positions):
        rect = patches.FancyBboxPatch((x, y), 0.2, 0.1, boxstyle="round,pad=0.05", edgecolor='black', facecolor='lightcoral', lw=1.5)
        ax.add_patch(rect)
        ax.text(x + 0.1, y + 0.05, component, ha='center', va='center', fontsize=10, fontweight='bold')

    # Draw connections
    connections = [((0.3, 0.4), (0.1, 0.6)), ((0.3, 0.4), (0.5, 0.6))]
    for start, end in connections:
        ax.annotate('', xy=end, xytext=start, arrowprops=dict(arrowstyle="->", lw=1.5))

    # Save and show the figure
    plt.savefig('/content/drive/MyDrive/Projects/xgboost_diagram.png', bbox_inches='tight')
    plt.show()

draw_xgboost_diagram()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

3.3 tables

In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table

def plot_random_forest_table():
    # Data for Random Forest method
    data_rf = {
        'Aspect': ['Technique', 'Core Concept', 'Training Process', 'Prediction Aggregation', 'Overfitting Mitigation', 'Feature Handling', 'Strengths', 'Application', 'Reference'],
        'Description': [
            'Ensemble Learning Method',
            'Aggregates the predictions of multiple decision trees to enhance accuracy and robustness',
            'Creates numerous decision trees, each trained on a subset of the data using a bootstrap sample',
            'Combines tree predictions through majority voting (classification) or averaging (regression)',
            'Reduces variance and improves generalization by averaging results from multiple trees',
            'Manages high-dimensional data effectively and captures complex patterns by considering various features and interactions simultaneously',
            '• Handles high-dimensional data well\n• Captures complex interactions\n• Robust to overfitting',
            'Useful for predicting healthcare costs by analyzing interactions between variables such as age, BMI, and smoking status',
            'Aldahiri, A., Alrashed, B., & Hussain, W. (2021). Trends in using IoT with machine learning in health prediction system. Forecasting.'
        ]
    }
    df_rf = pd.DataFrame(data_rf)

    # Plot the table
    fig, ax = plt.subplots(figsize=(10, 6))  # Set the figure size
    ax.axis('off')  # Hide axes
    tbl = table(ax, df_rf, loc='center', cellLoc='left', colWidths=[0.4] * len(df_rf.columns))
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1.2, 1.2)
    ax.set_title('Table: Summary of Random Forest Method', fontsize=14, fontweight='bold')

    # Save the table as an image
    plt.savefig('/content/drive/MyDrive/Projects/random_forest_table.png', bbox_inches='tight')
    plt.show()

plot_random_forest_table()
No description has been provided for this image
In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table

def plot_gbm_table():
    # Data for GBM method
    data_gbm = {
        'Aspect': ['Method', 'Key Characteristics', 'Process', 'Strengths', 'Application'],
        'Description': [
            'Gradient Boosting Machines (GBM)',
            'Builds models sequentially, each correcting errors of its predecessor',
            'Iterative training to refine predictions and capture complex relationships',
            'Excels in modelling complex interactions and non-linear patterns, improving prediction accuracy incrementally',
            'Effective for predicting healthcare costs by addressing nuances and interactions in the data (Wang, 2021)'
        ]
    }
    df_gbm = pd.DataFrame(data_gbm)

    # Plot the table
    fig, ax = plt.subplots(figsize=(10, 6))  # Set the figure size
    ax.axis('off')  # Hide axes
    tbl = table(ax, df_gbm, loc='center', cellLoc='left', colWidths=[0.4] * len(df_gbm.columns))
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1.2, 1.2)
    ax.set_title('Table: Overview of Gradient Boosting Machines (GBM)', fontsize=14, fontweight='bold')

    # Save the table as an image
    plt.savefig('/content/drive/MyDrive/Projects/gbm_table.png', bbox_inches='tight')
    plt.show()

plot_gbm_table()
No description has been provided for this image

3.3.3 Table

In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import table

def plot_xgboost_table():
    # Data for XGBoost method
    data_xgboost = {
        'Aspect': ['Method', 'Key Characteristics', 'Features', 'Strengths', 'Application'],
        'Description': [
            'XGBoost',
            'Optimized variant of GBM with advanced features',
            'Regularization techniques to prevent overfitting, parallel processing for faster computation',
            'Handles large datasets and complex feature interactions efficiently, improving prediction accuracy and reliability',
            'Particularly effective for healthcare cost prediction by addressing limitations of traditional methods (Johnson et al., 2023)'
        ]
    }
    df_xgboost = pd.DataFrame(data_xgboost)

    # Plot the table
    fig, ax = plt.subplots(figsize=(10, 6))  # Set the figure size
    ax.axis('off')  # Hide axes
    tbl = table(ax, df_xgboost, loc='center', cellLoc='left', colWidths=[0.4] * len(df_xgboost.columns))
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(10)
    tbl.scale(1.2, 1.2)
    ax.set_title('Table: Overview of XGBoost Method', fontsize=14, fontweight='bold')

    # Save the table as an image
    plt.savefig('/content/drive/MyDrive/Projects/xgboost_table.png', bbox_inches='tight')
    plt.show()

plot_xgboost_table()
No description has been provided for this image

3.4.1

In [ ]:
import matplotlib.pyplot as plt

def plot_mae_diagram():
    # Data for MAE diagram
    models = ['Random Forest', 'GBM', 'XGBoost']
    mae_values = [0.23, 0.18, 0.15]  # Example MAE values; replace with actual values

    # Plotting MAE
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(models, mae_values, color=['blue', 'orange', 'green'])
    ax.set_xlabel('Models', fontsize=12)
    ax.set_ylabel('Mean Absolute Error (MAE)', fontsize=12)
    ax.set_title('Figure 3.4.1: Mean Absolute Error (MAE) for Each Model', fontsize=14, fontweight='bold')
    ax.set_ylim(0, max(mae_values) + 0.1)

    # Adding value labels on the bars
    for i, value in enumerate(mae_values):
        ax.text(i, value + 0.01, f'{value:.2f}', ha='center', va='bottom')

    plt.savefig('/content/drive/MyDrive/Projects/mae_diagram.png', bbox_inches='tight')
    plt.show()

plot_mae_diagram()
import matplotlib.pyplot as plt

def plot_rmse_diagram():
    # Data for RMSE diagram
    models = ['Random Forest', 'GBM', 'XGBoost']
    rmse_values = [0.30, 0.25, 0.20]  # Example RMSE values; replace with actual values

    # Plotting RMSE
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(models, rmse_values, color=['blue', 'orange', 'green'])
    ax.set_xlabel('Models', fontsize=12)
    ax.set_ylabel('Root Mean Squared Error (RMSE)', fontsize=12)
    ax.set_title('Figure 3.4.2: Root Mean Squared Error (RMSE) for Each Model', fontsize=14, fontweight='bold')
    ax.set_ylim(0, max(rmse_values) + 0.1)

    # Adding value labels on the bars
    for i, value in enumerate(rmse_values):
        ax.text(i, value + 0.01, f'{value:.2f}', ha='center', va='bottom')

    plt.savefig('/content/drive/MyDrive/Projects/rmse_diagram.png', bbox_inches='tight')
    plt.show()

plot_rmse_diagram()
import matplotlib.pyplot as plt

def plot_r_squared_diagram():
    # Data for R-Squared diagram
    models = ['Random Forest', 'GBM', 'XGBoost']
    r_squared_values = [0.75, 0.80, 0.85]  # Example R² values; replace with actual values

    # Plotting R-Squared
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.bar(models, r_squared_values, color=['blue', 'orange', 'green'])
    ax.set_xlabel('Models', fontsize=12)
    ax.set_ylabel('R-Squared (R²)', fontsize=12)
    ax.set_title('Figure 3.4.3: R-Squared (R²) for Each Model', fontsize=14, fontweight='bold')
    ax.set_ylim(0, 1.1)

    # Adding value labels on the bars
    for i, value in enumerate(r_squared_values):
        ax.text(i, value + 0.02, f'{value:.2f}', ha='center', va='bottom')

    plt.savefig('/content/drive/MyDrive/Projects/r_squared_diagram.png', bbox_inches='tight')
    plt.show()

plot_r_squared_diagram()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

5 chapter

In [ ]:
import matplotlib.pyplot as plt

# Define data
aspects = ['Data Privacy', 'Algorithmic Bias', 'Responsible Use']
percentages = [33, 33, 34]  # Assuming equal importance for simplicity

# Create pie chart
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(percentages, labels=aspects, autopct='%1.1f%%', colors=['skyblue', 'lightgreen', 'lightcoral'], startangle=140)
ax.set_title('Ethical Aspects of ML in Healthcare', fontsize=16, fontweight='bold')

# Save the diagram
plt.savefig('/content/drive/MyDrive/Projects/ethical_aspects_diagram.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

5.1

In [ ]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 10))

# Define text properties
textprops = dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='lightblue')

# Define positions and labels for each step
steps = [
    ("Data Anonymization\n& De-Identification", (0.5, 0.9)),
    ("Access Controls\n(Role-Based)", (0.5, 0.75)),
    ("Encryption\n(At Rest & In Transit)", (0.5, 0.6)),
    ("Informed Consent\n(Transparency)", (0.5, 0.45)),
    ("Regular Audits\n& Assessments", (0.5, 0.3)),
    ("Data Stewardship\n(Compliance)", (0.5, 0.15)),
    ("Regulatory Compliance\n(GDPR, HIPAA)", (0.5, 0.0))
]

# Add text boxes for each step
for step, pos in steps:
    ax.text(pos[0], pos[1], step, ha='center', va='center', fontsize=12, bbox=textprops)

# Add arrows between steps
for i in range(len(steps) - 1):
    start = steps[i][1]
    end = steps[i + 1][1]
    ax.annotate('', xy=end, xytext=start,
                arrowprops=dict(facecolor='black', shrink=0.05))

# Set title and hide axes
ax.set_title('Data Privacy Considerations in ML Healthcare Applications', fontsize=16, fontweight='bold')
ax.set_xlim(0, 1)
ax.set_ylim(-0.1, 1)
ax.axis('off')

# Save the diagram
plt.savefig('/content/drive/MyDrive/Projects/data_privacy_flowchart.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

5.2

In [ ]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 12))

# Define text properties
textprops = dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='lightgreen')

# Define positions and labels for each step
steps = [
    ("Biased Training Data", (0.5, 0.9)),
    ("Bias Mitigation\n Techniques", (0.5, 0.75)),
    ("Fairness Metrics\n Evaluation", (0.5, 0.6)),
    ("Ongoing Monitoring\n& Updates", (0.5, 0.45)),
    ("Transparency\n (Model Explainability)", (0.5, 0.3)),
    ("Diverse Team\nComposition", (0.5, 0.15)),
    ("Collaboration\n (Ethicists & Experts)", (0.5, 0.0))
]

# Add text boxes for each step
for step, pos in steps:
    ax.text(pos[0], pos[1], step, ha='center', va='center', fontsize=12, bbox=textprops)

# Add arrows between steps
for i in range(len(steps) - 1):
    start = steps[i][1]
    end = steps[i + 1][1]
    ax.annotate('', xy=end, xytext=start,
                arrowprops=dict(facecolor='black', shrink=0.05))

# Set title and hide axes
ax.set_title('Addressing Algorithmic Bias in ML Healthcare Models', fontsize=16, fontweight='bold')
ax.set_xlim(0, 1)
ax.set_ylim(-0.1, 1)
ax.axis('off')

# Save the diagram
plt.savefig('/content/drive/MyDrive/Projects/algorithmic_bias_flowchart.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

5.3

In [ ]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 12))

# Define text properties
textprops = dict(boxstyle='round,pad=0.3', edgecolor='black', facecolor='lightblue')

# Define positions and labels for each recommendation
recommendations = [
    ("Transparency\n& Explainability", (0.5, 0.9)),
    ("Ethical Governance\nFramework", (0.5, 0.75)),
    ("Proactive Bias\nMitigation", (0.5, 0.6)),
    ("Compliance with\nLegal Regulations", (0.5, 0.45)),
    ("Stakeholder Engagement", (0.5, 0.3)),
    ("Ongoing Education\n& Training", (0.5, 0.15))
]

# Add text boxes for each recommendation
for rec, pos in recommendations:
    ax.text(pos[0], pos[1], rec, ha='center', va='center', fontsize=12, bbox=textprops)

# Add arrows between recommendations
for i in range(len(recommendations) - 1):
    start = recommendations[i][1]
    end = recommendations[i + 1][1]
    ax.annotate('', xy=end, xytext=start,
                arrowprops=dict(facecolor='black', shrink=0.05))

# Add an overall title
ax.set_title('Recommendations for Responsible Use of ML in Healthcare', fontsize=16, fontweight='bold')

# Hide axes
ax.set_xlim(0, 1)
ax.set_ylim(-0.1, 1)
ax.axis('off')

# Save the diagram
plt.savefig('/content/drive/MyDrive/Projects/responsible_ml_use_recommendations.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

6.1

In [ ]:
import matplotlib.pyplot as plt
import numpy as np

# Define the models and their performance metrics
models = ['Random Forest', 'Gradient Boosting']
mae_values = [2664.97, 2490.64]
rmse_values = [4634.45, 4438.10]

x = np.arange(len(models))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(10, 6))

# Plot MAE and RMSE
rects1 = ax.bar(x - width/2, mae_values, width, label='MAE')
rects2 = ax.bar(x + width/2, rmse_values, width, label='RMSE')

# Add some text for labels, title and custom x-axis tick labels
ax.set_xlabel('Models')
ax.set_ylabel('Values')
ax.set_title('Comparison of ML Models Performance')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

# Add value annotations on bars
def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

# Save and show the diagram
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Projects/model_performance_comparison.png')
plt.show()
No description has been provided for this image

6.1 table

In [ ]:
import matplotlib.pyplot as plt
import pandas as pd

# Data for the table
data = {
    'Metric': ['Mean Absolute Error (MAE)', 'Root Mean Squared Error (RMSE)', 'Average R² Score'],
    'Random Forest': [2664.97, 4634.45, 0.83],
    'Gradient Boosting': [2490.64, 4438.10, 0.86]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Plot the table
fig, ax = plt.subplots(figsize=(8, 4))  # set size frame
ax.axis('tight')
ax.axis('off')
table = ax.table(cellText=df.values, colLabels=df.columns, rowLabels=df['Metric'], cellLoc='center', loc='center')

# Save and show the table
plt.savefig('/content/drive/MyDrive/Projects/model_performance_summary.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

6.2

In [ ]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 8))

# Define colors and coordinates
colors = ['lightblue', 'lightgreen', 'lightcoral']
stakeholders = ['Healthcare Providers', 'Insurance Companies', 'Policymakers']
implications = [
    'Optimize resource allocation and patient management\nTailor services to high-risk populations\nReduce overall healthcare expenditures',
    'Enhance risk assessment and pricing strategies\nSet premiums more accurately\nDevelop personalized insurance products',
    'Improve transparency and fairness\nInform policy decisions\nDevelop equitable strategies for resource distribution'
]
coordinates = [(0.2, 0.8), (0.2, 0.5), (0.2, 0.2)]

# Add stakeholder boxes
for (x, y), stakeholder, implication, color in zip(coordinates, stakeholders, implications, colors):
    rect = patches.FancyBboxPatch((x - 0.1, y - 0.1), 0.4, 0.2, boxstyle="round,pad=0.1", edgecolor='black', facecolor=color, linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y, stakeholder, ha='center', va='center', fontsize=12, fontweight='bold')
    ax.text(x, y - 0.05, implication, ha='center', va='center', fontsize=10, wrap=True)

# Draw arrows
for (x1, y1), (x2, y2) in zip(coordinates, coordinates[1:]):
    ax.annotate('', xy=(x2, y2 + 0.1), xytext=(x1, y1 - 0.1),
                arrowprops=dict(arrowstyle='->', lw=2))

# Set limits and hide axes
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')

# Title
plt.title('Implications of ML Models for Stakeholders in Healthcare', fontsize=14, fontweight='bold')

# Save and show the diagram
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Projects/ml_impact_stakeholders.png')
plt.show()
No description has been provided for this image
In [ ]:
import matplotlib.pyplot as plt
import pandas as pd

# Data for the table
data = {
    'Stakeholder': ['Healthcare Providers', 'Insurance Companies', 'Policymakers'],
    'Implications': [
        'Optimize resource allocation and patient management; Tailor services to high-risk populations; Reduce overall healthcare expenditures',
        'Enhance risk assessment and pricing strategies; Set premiums more accurately; Develop personalized insurance products',
        'Improve transparency and fairness; Inform policy decisions; Develop equitable strategies for resource distribution'
    ]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Plot the table
fig, ax = plt.subplots(figsize=(10, 5))  # set size frame
ax.axis('tight')
ax.axis('off')
table = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='left', loc='center', colColours=['#f5f5f5']*2)

# Save and show the table
plt.savefig('/content/drive/MyDrive/Projects/ml_implications_table.png', bbox_inches='tight')
plt.show()
No description has been provided for this image

6.3

In [ ]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Create a figure and axis
fig, ax = plt.subplots(figsize=(14, 10))

# Define colors and coordinates
colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightgoldenrodyellow']
research_areas = [
    'Advanced ML Algorithms',
    'Incorporating Additional Features',
    'Fairness and Equity in ML',
    'Real-World Integration'
]
details = [
    'Explore deep learning techniques\n(e.g., CNNs, RNNs)\nAssess performance vs. traditional models',
    'Include additional variables\n(e.g., socioeconomic status, longitudinal data)\nEnhance prediction precision',
    'Develop fairness-aware algorithms\nImplement bias correction methods\nEnsure equity across demographics',
    'Evaluate ML model implementation in real-world settings\nAssess integration with health information systems\nIdentify practical challenges'
]
coordinates = [(0.2, 0.8), (0.2, 0.6), (0.2, 0.4), (0.2, 0.2)]

# Add research area boxes
for (x, y), area, detail, color in zip(coordinates, research_areas, details, colors):
    rect = patches.FancyBboxPatch((x - 0.15, y - 0.1), 0.3, 0.2, boxstyle="round,pad=0.1", edgecolor='black', facecolor=color, linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y + 0.05, area, ha='center', va='center', fontsize=12, fontweight='bold')
    ax.text(x, y - 0.05, detail, ha='center', va='center', fontsize=10, wrap=True)

# Draw arrows
for (x1, y1), (x2, y2) in zip(coordinates, coordinates[1:]):
    ax.annotate('', xy=(x2, y2 + 0.1), xytext=(x1, y1 - 0.1),
                arrowprops=dict(arrowstyle='->', lw=2))

# Set limits and hide axes
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')

# Title
plt.title('Future Research Directions in ML for Healthcare Cost Prediction', fontsize=14, fontweight='bold')

# Save and show the diagram
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Projects/future_research_directions.png')
plt.show()
No description has been provided for this image

6.4

In [ ]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Create a figure and axis
fig, ax = plt.subplots(figsize=(14, 10))

# Define colors and coordinates
colors = ['lightblue', 'lightgreen', 'lightcoral', 'lightgoldenrodyellow']
topics = [
    'ML Models: Random Forest vs. Gradient Boosting',
    'Accuracy Improvement',
    'Ethical Considerations',
    'Future Research Directions'
]
details = [
    'Gradient Boosting: MAE = 2,490.64\nRMSE = 4,438.10\nOutperforms Random Forest',
    'Enhanced Predictive Accuracy\nBetter Resource Allocation\nFinancial Planning',
    'Transparency\nData Privacy\nBias Mitigation',
    'Advanced Algorithms\nBroader Feature Sets\nReal-World Implementation'
]
coordinates = [(0.2, 0.8), (0.2, 0.6), (0.2, 0.4), (0.2, 0.2)]

# Add topic boxes
for (x, y), topic, detail, color in zip(coordinates, topics, details, colors):
    rect = patches.FancyBboxPatch((x - 0.15, y - 0.1), 0.3, 0.2, boxstyle="round,pad=0.1", edgecolor='black', facecolor=color, linewidth=2)
    ax.add_patch(rect)
    ax.text(x, y + 0.05, topic, ha='center', va='center', fontsize=12, fontweight='bold')
    ax.text(x, y - 0.05, detail, ha='center', va='center', fontsize=10, wrap=True)

# Draw arrows between topics
for (x1, y1), (x2, y2) in zip(coordinates, coordinates[1:]):
    ax.annotate('', xy=(x2, y2 + 0.1), xytext=(x1, y1 - 0.1),
                arrowprops=dict(arrowstyle='->', lw=2))

# Set limits and hide axes
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')

# Title
plt.title('Final Thoughts on ML for Healthcare Cost Prediction', fontsize=14, fontweight='bold')

# Save and show the diagram
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Projects/final_thoughts_concept_map.png')
plt.show()
No description has been provided for this image
In [ ]:
from google.colab import files
import nbformat
from nbconvert import HTMLExporter

# Step 1: Upload the .ipynb file
uploaded = files.upload()

# Assuming only one file is uploaded, get the file name
notebook_filename = next(iter(uploaded))

# Step 2: Convert the .ipynb file to HTML
with open(notebook_filename) as f:
    notebook_content = f.read()

# Parse the notebook content
notebook = nbformat.reads(notebook_content, as_version=4)

# Convert to HTML
html_exporter = HTMLExporter()
(html_content, _) = html_exporter.from_notebook_node(notebook)

# Step 3: Save the HTML content to a file
html_filename = notebook_filename.replace('.ipynb', '.html')
with open(html_filename, 'w') as f:
    f.write(html_content)

# Step 4: Download the HTML file
files.download(html_filename)
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving Maheshinsurance (3).ipynb to Maheshinsurance (3) (1).ipynb
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-b42eeead7a05> in <cell line: 20>()
     18 # Convert to HTML
     19 html_exporter = HTMLExporter()
---> 20 (html_content, _) = html_exporter.from_notebook_node(notebook)
     21 
     22 # Step 3: Save the HTML content to a file

/usr/local/lib/python3.10/dist-packages/nbconvert/exporters/html.py in from_notebook_node(self, nb, resources, **kw)
    221         self.register_filter("highlight_code", highlight_code)
    222         self.register_filter("filter_data_type", filter_data_type)
--> 223         return super().from_notebook_node(nb, resources, **kw)
    224 
    225     def _init_resources(self, resources):

/usr/local/lib/python3.10/dist-packages/nbconvert/exporters/templateexporter.py in from_notebook_node(self, nb, resources, **kw)
    411 
    412         # Top level variables are passed to the template_exporter here.
--> 413         output = self.template.render(nb=nb_copy, resources=resources)
    414         output = output.lstrip("\r\n")
    415         return output, resources

/usr/local/lib/python3.10/dist-packages/jinja2/environment.py in render(self, *args, **kwargs)
   1302             return self.environment.concat(self.root_render_func(ctx))  # type: ignore
   1303         except Exception:
-> 1304             self.environment.handle_exception()
   1305 
   1306     async def render_async(self, *args: t.Any, **kwargs: t.Any) -> str:

/usr/local/lib/python3.10/dist-packages/jinja2/environment.py in handle_exception(self, source)
    937         from .debug import rewrite_traceback_stack
    938 
--> 939         raise rewrite_traceback_stack(source=source)
    940 
    941     def join_path(self, template: str, parent: str) -> str:

/usr/local/share/jupyter/nbconvert/templates/lab/index.html.j2 in top-level template code()
      1 {%- extends 'base.html.j2' -%}
      2 {% from 'mathjax.html.j2' import mathjax %}
----> 3 {% from 'jupyter_widgets.html.j2' import jupyter_widgets %}
      4 
      5 {%- block header -%}

/usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in top-level template code()
      1 {%- extends 'display_priority.j2' -%}
----> 2 {% from 'celltags.j2' import celltags %}
      3 
      4 {% block codecell %}
      5 {%- if not cell.outputs -%}

/usr/local/share/jupyter/nbconvert/templates/base/display_priority.j2 in top-level template code()
----> 1 {%- extends 'base/null.j2' -%}
      2 
      3 {#display data priority#}
      4 
      5 

/usr/local/share/jupyter/nbconvert/templates/base/null.j2 in top-level template code()
     24 {%- block header -%}
     25 {%- endblock header -%}
---> 26 {%- block body -%}
     27     {%- block body_header -%}
     28     {%- endblock body_header -%}

/usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'body'()
     27     {%- block body_header -%}
     28     {%- endblock body_header -%}
---> 29     {%- block body_loop -%}
     30         {%- for cell in nb.cells -%}
     31             {%- block any_cell scoped -%}

/usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'body_loop'()
     29     {%- block body_loop -%}
     30         {%- for cell in nb.cells -%}
---> 31             {%- block any_cell scoped -%}
     32                 {%- if cell.cell_type == 'code'-%}
     33                     {%- if resources.global_content_filter.include_code -%}

/usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'any_cell'()
     32                 {%- if cell.cell_type == 'code'-%}
     33                     {%- if resources.global_content_filter.include_code -%}
---> 34                     {%- block codecell scoped -%}
     35                         {%- if resources.global_content_filter.include_input and not cell.get("transient",{}).get("remove_source", false) -%}
     36                             {%- block input_group -%}

/usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'codecell'()
     10 {%- endif -%}
     11 <div class="jp-Cell jp-CodeCell jp-Notebook-cell {{ no_output_class }} {{ no_input_class }} {{ celltags(cell) }}">
---> 12 {{ super() }}
     13 </div>
     14 {%- endblock codecell %}

/usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'codecell'()
     42                         {%- endif -%}
     43                         {%- if cell.outputs and resources.global_content_filter.include_output -%}
---> 44                             {%- block output_group -%}
     45                                 {%- if resources.global_content_filter.include_output_prompt -%}
     46                                     {%- block output_prompt -%}{%- endblock output_prompt -%}

/usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'output_group'()
     36 <div class="jp-Collapser jp-OutputCollapser jp-Cell-outputCollapser">
     37 </div>
---> 38 {{ super() }}
     39 </div>
     40 {% endblock output_group %}

/usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'output_group'()
     46                                     {%- block output_prompt -%}{%- endblock output_prompt -%}
     47                                 {%- endif -%}
---> 48                                 {%- block outputs scoped -%}
     49                                     {%- for output in cell.outputs -%}
     50                                         {%- block output scoped -%}

/usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'outputs'()
     42 {% block outputs %}
     43 <div class="jp-OutputArea jp-Cell-outputArea">
---> 44 {{ super() }}
     45 </div>
     46 {% endblock outputs %}

/usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'outputs'()
     48                                 {%- block outputs scoped -%}
     49                                     {%- for output in cell.outputs -%}
---> 50                                         {%- block output scoped -%}
     51                                             {%- if output.output_type == 'execute_result' -%}
     52                                                 {%- block execute_result scoped -%}{%- endblock execute_result -%}

/usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'output'()
     85     {{ self.output_area_prompt() }}
     86 {% endif %}
---> 87 {{ super() }}
     88 </div>
     89 {% endblock output %}

/usr/local/share/jupyter/nbconvert/templates/base/null.j2 in block 'output'()
     50                                         {%- block output scoped -%}
     51                                             {%- if output.output_type == 'execute_result' -%}
---> 52                                                 {%- block execute_result scoped -%}{%- endblock execute_result -%}
     53                                             {%- elif output.output_type == 'stream' -%}
     54                                                 {%- block stream scoped -%}

/usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'execute_result'()
    123 {% block execute_result -%}
    124 {%- set extra_class="jp-OutputArea-executeResult" -%}
--> 125 {% block data_priority scoped %}
    126 {{ super() }}
    127 {% endblock data_priority %}

/usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'data_priority'()
    124 {%- set extra_class="jp-OutputArea-executeResult" -%}
    125 {% block data_priority scoped %}
--> 126 {{ super() }}
    127 {% endblock data_priority %}
    128 {%- set extra_class="" -%}

/usr/local/share/jupyter/nbconvert/templates/base/display_priority.j2 in block 'data_priority'()
     10             {%- endblock -%}
     11         {%- elif type == 'image/svg+xml' -%}
---> 12             {%- block data_svg -%}
     13             {%- endblock -%}
     14         {%- elif type == 'image/png' -%}

/usr/local/share/jupyter/nbconvert/templates/lab/base.html.j2 in block 'data_svg'()
    160 <img src="{{ output.svg_filename | posix_path | escape_html }}">
    161 {%- else %}
--> 162 {{ output.data['image/svg+xml'] | clean_html }}
    163 {%- endif %}
    164 </div>

/usr/local/lib/python3.10/dist-packages/lxml/html/clean.cpython-310-x86_64-linux-gnu.so in lxml.html.clean.Cleaner.clean_html()

/usr/local/lib/python3.10/dist-packages/lxml/html/__init__.py in fromstring(html, base_url, parser, **kw)
    871     else:
    872         is_full_html = _looks_like_full_html_unicode(html)
--> 873     doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    874     if is_full_html:
    875         return doc

/usr/local/lib/python3.10/dist-packages/lxml/html/__init__.py in document_fromstring(html, parser, ensure_head_body, **kw)
    757     if parser is None:
    758         parser = html_parser
--> 759     value = etree.fromstring(html, parser, **kw)
    760     if value is None:
    761         raise etree.ParserError(

src/lxml/etree.pyx in lxml.etree.fromstring()

src/lxml/parser.pxi in lxml.etree._parseMemoryDocument()

ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
In [ ]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta

# Define the start date for the project
start_date = datetime(2024, 7, 1)

# Define the tasks and their durations
tasks = {
    "Draft Chapter 1 & 2": [start_date, start_date + timedelta(weeks=1)],
    "Design Methodology": [start_date + timedelta(weeks=1), start_date + timedelta(weeks=2)],
    "Data Collection & Preprocessing": [start_date + timedelta(weeks=2), start_date + timedelta(weeks=3)],
    "Develop ML Models": [start_date + timedelta(weeks=3), start_date + timedelta(weeks=4)],
    "Model Training & Evaluation": [start_date + timedelta(weeks=4), start_date + timedelta(weeks=5)],
    "Analyze Results & Draft Chapter 5": [start_date + timedelta(weeks=5), start_date + timedelta(weeks=6)],
    "Write Chapters 6 & 7": [start_date + timedelta(weeks=6), start_date + timedelta(weeks=7)],
    "Final Revisions & Submission": [start_date + timedelta(weeks=7), start_date + timedelta(weeks=8)],
}

# Create a figure and axis
fig, ax = plt.subplots(figsize=(10, 6))

# Plot each task
for i, (task, (start, end)) in enumerate(tasks.items()):
    ax.barh(task, (end - start).days, left=start, color='skyblue', edgecolor='black')

# Set the date format on the x-axis
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

# Rotate date labels for better readability
plt.xticks(rotation=45)

# Add labels and title
plt.xlabel('Date')
plt.ylabel('Tasks')
plt.title('8-Week Project Plan Gantt Chart')

# Display the grid
plt.grid(True)

# Show the plot
plt.tight_layout()
plt.show()
No description has been provided for this image